importing Libraries

In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
import pickle
from tqdm import tqdm
import os
from prettytable import PrettyTable
import csv
import math
from chart_studio import plotly
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
from collections import Counter
In [2]:
data_old=pd.read_csv('train_data.csv')
data_old.head(2)
Out[2]:
Unnamed: 0 id teacher_id teacher_prefix school_state project_submitted_datetime project_grade_category project_subject_categories project_subject_subcategories project_title project_essay_1 project_essay_2 project_essay_3 project_essay_4 project_resource_summary teacher_number_of_previously_posted_projects project_is_approved
0 160221 p253737 c90749f5d961ff158d4b4d1e7dc665fc Mrs. IN 2016-12-05 13:43:57 Grades PreK-2 Literacy & Language ESL, Literacy Educational Support for English Learners at Home My students are English learners that are work... \"The limits of your language are the limits o... NaN NaN My students need opportunities to practice beg... 0 0
1 140945 p258326 897464ce9ddc600bced1151f324dd63a Mr. FL 2016-10-25 09:22:10 Grades 6-8 History & Civics, Health & Sports Civics & Government, Team Sports Wanted: Projector for Hungry Learners Our students arrive to our school eager to lea... The projector we need for our school is very c... NaN NaN My students need a projector to help with view... 7 1
In [3]:
data=pd.read_csv("preprocessed_data.csv")
data.head()
Out[3]:
school_state teacher_prefix project_grade_category teacher_number_of_previously_posted_projects project_is_approved clean_categories clean_subcategories essay price
0 ca mrs grades_prek_2 53 1 math_science appliedsciences health_lifescience i fortunate enough use fairy tale stem kits cl... 725.05
1 ut ms grades_3_5 4 1 specialneeds specialneeds imagine 8 9 years old you third grade classroo... 213.03
2 ca mrs grades_prek_2 10 1 literacy_language literacy having class 24 students comes diverse learner... 329.00
3 ga mrs grades_prek_2 2 1 appliedlearning earlydevelopment i recently read article giving students choice... 481.04
4 wa mrs grades_3_5 2 1 literacy_language literacy my students crave challenge eat obstacles brea... 17.74
In [4]:
data['project_title'] = data_old['project_title'].values
data.head(2)
Out[4]:
school_state teacher_prefix project_grade_category teacher_number_of_previously_posted_projects project_is_approved clean_categories clean_subcategories essay price project_title
0 ca mrs grades_prek_2 53 1 math_science appliedsciences health_lifescience i fortunate enough use fairy tale stem kits cl... 725.05 Educational Support for English Learners at Home
1 ut ms grades_3_5 4 1 specialneeds specialneeds imagine 8 9 years old you third grade classroo... 213.03 Wanted: Projector for Hungry Learners

Sentiment Analysis:

In [5]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sent_int = SentimentIntensityAnalyzer()
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\sarav\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
C:\Users\sarav\New folder\lib\site-packages\nltk\twitter\__init__.py:20: UserWarning:

The twython library has not been installed. Some functionality from the twitter package will not be available.

In [6]:
# https://analyticsindiamag.com/sentiment-analysis-made-easy-using-vader/#:~:text=The%20compound%20score%20is%20the,%25%20Negative%2C%2050.8%25%20Neutral.

negative = []
positive = []
neutral = []
compound = []

def update_sentiments(values):
    negative.append(values["neg"])    
    positive.append(values["pos"])
    neutral.append(values["neu"])
    compound.append(values["compound"])
In [7]:
from tqdm import tqdm
for essay in tqdm(data["essay"]):
    update_sentiments(sent_int.polarity_scores(essay))
100%|█████████████████████████████████████████████████████████████████████████| 109248/109248 [11:22<00:00, 160.06it/s]
In [8]:
data["negative"] = negative
data["positive"] = positive
data["neutral"] = neutral
data["compound"] = compound
In [9]:
data.head()
Out[9]:
school_state teacher_prefix project_grade_category teacher_number_of_previously_posted_projects project_is_approved clean_categories clean_subcategories essay price project_title negative positive neutral compound
0 ca mrs grades_prek_2 53 1 math_science appliedsciences health_lifescience i fortunate enough use fairy tale stem kits cl... 725.05 Educational Support for English Learners at Home 0.013 0.205 0.783 0.9867
1 ut ms grades_3_5 4 1 specialneeds specialneeds imagine 8 9 years old you third grade classroo... 213.03 Wanted: Projector for Hungry Learners 0.072 0.248 0.680 0.9897
2 ca mrs grades_prek_2 10 1 literacy_language literacy having class 24 students comes diverse learner... 329.00 Soccer Equipment for AWESOME Middle School Stu... 0.017 0.262 0.721 0.9860
3 ga mrs grades_prek_2 2 1 appliedlearning earlydevelopment i recently read article giving students choice... 481.04 Techie Kindergarteners 0.030 0.187 0.783 0.9524
4 wa mrs grades_3_5 2 1 literacy_language literacy my students crave challenge eat obstacles brea... 17.74 Interactive Math Tools 0.029 0.288 0.683 0.9873
In [10]:
stopwords= ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"]
In [11]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase
In [12]:
from tqdm import tqdm
import re
def preprocess_text(text_data):
    preprocessed_text = []
    # tqdm is for printing the status bar
    for sentance in tqdm(text_data):
        sent = decontracted(sentance)
        sent = sent.replace('\\r', ' ')
        sent = sent.replace('\\n', ' ')
        sent = sent.replace('\\"', ' ')
        sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
        # https://gist.github.com/sebleier/554280
        sent = ' '.join(e for e in sent.split() if e.lower() not in stopwords)
        preprocessed_text.append(sent.lower().strip())
    return preprocessed_text
In [13]:
data['clean_project_title']=preprocess_text(data['project_title'].values)
100%|███████████████████████████████████████████████████████████████████████| 109248/109248 [00:02<00:00, 38359.72it/s]
In [14]:
data.drop(['project_title'],axis='columns',inplace=True)
data.head()
Out[14]:
school_state teacher_prefix project_grade_category teacher_number_of_previously_posted_projects project_is_approved clean_categories clean_subcategories essay price negative positive neutral compound clean_project_title
0 ca mrs grades_prek_2 53 1 math_science appliedsciences health_lifescience i fortunate enough use fairy tale stem kits cl... 725.05 0.013 0.205 0.783 0.9867 educational support english learners home
1 ut ms grades_3_5 4 1 specialneeds specialneeds imagine 8 9 years old you third grade classroo... 213.03 0.072 0.248 0.680 0.9897 wanted projector hungry learners
2 ca mrs grades_prek_2 10 1 literacy_language literacy having class 24 students comes diverse learner... 329.00 0.017 0.262 0.721 0.9860 soccer equipment awesome middle school students
3 ga mrs grades_prek_2 2 1 appliedlearning earlydevelopment i recently read article giving students choice... 481.04 0.030 0.187 0.783 0.9524 techie kindergarteners
4 wa mrs grades_3_5 2 1 literacy_language literacy my students crave challenge eat obstacles brea... 17.74 0.029 0.288 0.683 0.9873 interactive math tools

50K Data is used here

In [15]:
X=data[:50000]
y=X['project_is_approved'].values
#data.drop(['project_is_approved'],axis='columns',inplace=True)

Splitting dataset:

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y,random_state=42) 
print(len(X_train))
print(len(X_test))
33500
16500

TFIDF VECTORIZATION

In [17]:
vect_tfidf_1 = TfidfVectorizer(min_df=10, ngram_range=(1,4), max_features=5000)
def TFIDF_Vectorization(train_column, test_column):
    vect_tfidf_1.fit(train_column)
    X_train_tfidf = vect_tfidf_1.transform(train_column)
    X_test_tfidf = vect_tfidf_1.transform(test_column)
    return (X_train_tfidf, X_test_tfidf)
In [18]:
train1 = X_train['essay'].values
test1=X_test['essay'].values
X_train_essay,X_test_essay = tqdm(TFIDF_Vectorization(train1,test1))
print("After vectorizations")
print(X_train_essay.shape, y_train.shape)
print(X_test_essay.shape, y_test.shape)
100%|████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<?, ?it/s]
After vectorizations
(33500, 5000) (33500,)
(16500, 5000) (16500,)

In [19]:
train2 = X_train['clean_project_title'].values
test2=X_test['clean_project_title'].values
X_train_tit,X_test_tit = tqdm(TFIDF_Vectorization(train2,test2))
print("After vectorizations")
print(X_train_tit.shape, y_train.shape)
print(X_test_tit.shape, y_test.shape)
100%|████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<?, ?it/s]
After vectorizations
(33500, 2443) (33500,)
(16500, 2443) (16500,)

TFIDF W2V

In [20]:
###http://www.jessicayung.com/how-to-use-pickle-to-save-and-load-variables-in-python/

import pickle
with open (r'glove_vectors', "rb") as f:
    model = pickle.load(f)
    glove_words = set(model.keys())

TFIDF W2V on Train data

In [21]:
tfidf_model = TfidfVectorizer()
tfidf_model.fit(X_train['essay'])
tfidf_feat=tfidf_model.get_feature_names()
# we are converting a dictionary with word as a key, and the idf as a value
dictionary = dict(zip(tfidf_model.get_feature_names(), list(tfidf_model.idf_)))
tfidf_words = set(tfidf_model.get_feature_names())
In [82]:
## https://medium.com/analytics-vidhya/featurization-of-text-data-bow-tf-idf-avgw2v-tfidf-weighted-w2v-7a6c62e8b097#:~:text=TFIDF%20weighted%20Word2Vec,sum%20by%20sum%20tfidf%20value.
from scipy import sparse
train_tfidf_w2v_vectors = [];
for sentence in tqdm(X_train['essay']):
    vector = np.zeros(300)
    tf_idf_weight =0;
    for word in sentence.split(): 
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word] 
            tf_idf = dictionary[word]*(sentence.count(word)/len(sentence.split()))
            vector += (vec * tf_idf) 
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    train_tfidf_w2v_vectors.append(vector)
### https://machinelearningmastery.com/sparse-matrices-for-machine-learning/
train_tfidf_w2v_vectors=sparse.csr_matrix(train_tfidf_w2v_vectors)

print((train_tfidf_w2v_vectors.shape))
100%|███████████████████████████████████████████████████████████████████████████| 33500/33500 [03:23<00:00, 164.89it/s]
(33500, 300)

TFIDF W2V on Test data

In [84]:
test_tfidf_w2v_vectors = []; 
for sentence in tqdm(X_test['essay']):
    vector = np.zeros(300)
    tf_idf_weight =0;
    for word in sentence.split(): 
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word] 
            
            tf_idf = dictionary[word]*(sentence.count(word)/len(sentence.split())) 
            vector += (vec * tf_idf) 
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    test_tfidf_w2v_vectors.append(vector)
## https://machinelearningmastery.com/sparse-matrices-for-machine-learning/
test_tfidf_w2v_vectors=sparse.csr_matrix(test_tfidf_w2v_vectors)

print((test_tfidf_w2v_vectors.shape))
100%|███████████████████████████████████████████████████████████████████████████| 16500/16500 [01:17<00:00, 214.27it/s]
(16500, 300)
In [85]:
tfidf_model1 = TfidfVectorizer()
tfidf_model1.fit(X_train['clean_project_title'])
tfidf_feat=tfidf_model1.get_feature_names()
# we are converting a dictionary with word as a key, and the idf as a value
dictionary = dict(zip(tfidf_model1.get_feature_names(), list(tfidf_model1.idf_)))
tfidf_words = set(tfidf_model1.get_feature_names())
In [87]:
train_tfidf_w2v_vec = [];
for sentence in tqdm(X_train['clean_project_title']):
    vector = np.zeros(300)
    tf_idf_weight =0;
    for word in sentence.split(): 
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word] 
            tf_idf = dictionary[word]*(sentence.count(word)/len(sentence.split()))
            vector += (vec * tf_idf) 
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    train_tfidf_w2v_vec.append(vector)
## https://machinelearningmastery.com/sparse-matrices-for-machine-learning/    
train_tfidf_w2v_vec= sparse.csr_matrix(train_tfidf_w2v_vec)

print(train_tfidf_w2v_vec.shape)
100%|██████████████████████████████████████████████████████████████████████████| 33500/33500 [00:03<00:00, 9797.01it/s]
(33500, 300)
In [89]:
test_tfidf_w2v_vec = []; # the avg-w2v for each sentence/review is stored in this list
for sentence in tqdm(X_test['clean_project_title']):
    vector = np.zeros((300))
    tf_idf_weight =0;
    for word in sentence.split(): 
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word] 
            tf_idf = dictionary[word]*(sentence.count(word)/len(sentence.split())) 
            vector += (vec * tf_idf) # calculating tfidf weighted w2v
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    test_tfidf_w2v_vec.append(vector)
## https://machinelearningmastery.com/sparse-matrices-for-machine-learning/
test_tfidf_w2v_vec=sparse.csr_matrix(test_tfidf_w2v_vec)

print(test_tfidf_w2v_vec.shape)
100%|█████████████████████████████████████████████████████████████████████████| 16500/16500 [00:01<00:00, 12358.15it/s]
(16500, 300)

Encoding catgorical data(Response Coding):

In [64]:
## https://stackoverflow.com/questions/66122577/response-coding-for-categorical-data

def response_coding(xtrain, ytrain, feature):  
    dicary = dict()
    labels = xtrain[feature].unique()

    for i in tqdm(range(len(labels))):
        total_count = xtrain.loc[:,feature][(xtrain[feature] == labels[i])].count()
        p_0 = xtrain.loc[:, feature][((xtrain[feature] == labels[i]) & (ytrain==0))].count()
        p_1 = xtrain.loc[:, feature][((xtrain[feature] == labels[i]) & (ytrain==1))].count()

        dicary[labels[i]] = [p_1/total_count, p_0/total_count]

    return dicary
# For train set
def transform(feature, df ):
    diction = response_coding(X_train,y_train,feature)
    count_val = df[feature].value_counts()
    f_list=[]
    for c in df[feature]:
        if c in dict( count_val ).keys():# transform test data with trainning probabilities
            f_list.append( diction[c] )
        else:
            f_list.append([0.5, 0.5])
    return f_list
# For test set
def transform1(feature, df ):
    diction = response_coding(X_test,y_test,feature)
    count_val = df[feature].value_counts()
    f_list=[]
    for c in df[feature]:
        if c in dict( count_val ).keys():# transform test data with trainning probabilities
            f_list.append( diction[c] )
        else:
            f_list.append([0.5, 0.5])
    return f_list

School_state:

In [65]:
X_train_state_transform = transform('school_state',X_train)
X_test_state_transform = transform1('school_state',X_test)
X_train_state_transform = np.array(X_train_state_transform)
X_test_state_transform = np.array(X_test_state_transform)
print("After vectorizations")
print(X_train_state_transform.shape, y_train.shape)
print(X_test_state_transform.shape, y_test.shape)
100%|██████████████████████████████████████████████████████████████████████████████████| 51/51 [00:00<00:00, 67.39it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 51/51 [00:00<00:00, 151.62it/s]
After vectorizations
(33500, 2) (33500,)
(16500, 2) (16500,)

Teacher_prefix:

In [66]:
X_train_prefix_transform = transform('teacher_prefix',X_train)
X_test_prefix_transform = transform1('teacher_prefix',X_test)
X_train_prefix_transform = np.array(X_train_prefix_transform)
X_test_prefix_transform = np.array(X_test_prefix_transform)
print("After vectorizations")
print(X_train_prefix_transform.shape, y_train.shape)
print(X_test_prefix_transform.shape, y_test.shape)
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 61.20it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 100.76it/s]
After vectorizations
(33500, 2) (33500,)
(16500, 2) (16500,)

Project_grade_category:

In [67]:
X_train_grade_transform = transform('project_grade_category',X_train)
X_test_grade_transform = transform1('project_grade_category',X_test)
X_train_grade_transform = np.array(X_train_grade_transform)
X_test_grade_transform = np.array(X_test_grade_transform)
print("After vectorizations")
print(X_train_grade_transform.shape, y_train.shape)
print(X_test_grade_transform.shape, y_test.shape)
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 44.64it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 87.51it/s]
After vectorizations
(33500, 2) (33500,)
(16500, 2) (16500,)

Clean_category:

In [68]:
X_train_categories_transform = transform('clean_categories',X_train)
X_test_categories_transform = transform1('clean_categories',X_test)
X_train_categories_transform = np.array(X_train_categories_transform)
X_test_categories_transform = np.array(X_test_categories_transform)
print("After vectorizations")
print(X_train_categories_transform.shape, y_train.shape)
print(X_test_categories_transform.shape, y_test.shape)
100%|██████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 56.92it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 42/42 [00:00<00:00, 154.51it/s]
After vectorizations
(33500, 2) (33500,)
(16500, 2) (16500,)

Clean_subcategory:

In [69]:
X_train_sub_transform = transform('clean_subcategories',X_train)
X_test_sub_transform = transform1('clean_subcategories',X_test)
X_train_sub_transform = np.array(X_train_sub_transform)
X_test_sub_transform = np.array(X_test_sub_transform)
print("After vectorizations")
print(X_train_sub_transform.shape, y_train.shape)
print(X_test_sub_transform.shape, y_test.shape)
100%|████████████████████████████████████████████████████████████████████████████████| 337/337 [00:04<00:00, 82.83it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 311/311 [00:02<00:00, 133.75it/s]
After vectorizations
(33500, 2) (33500,)
(16500, 2) (16500,)

Encoding numerical data:

In [70]:
def numerical_data(train_value,test_value):
    normalizer=Normalizer()
    normalizer.fit(train_value)
    X_train_transform_norm=normalizer.transform(train_value)
    X_test_transform_norm=normalizer.transform(test_value)
    return X_train_transform_norm,X_test_transform_norm

Price:

In [71]:
train7=X_train['price'].values.reshape(-1,1)
test7=X_test['price'].values.reshape(-1,1)
X_train_price_transform,X_test_price_transform = numerical_data(train7,test7)
print("After vectorizations")
print(X_train_price_transform.shape, y_train.shape)
print(X_test_price_transform.shape, y_test.shape)
After vectorizations
(33500, 1) (33500,)
(16500, 1) (16500,)

teacher_number_of_previously_posted_projects:

In [72]:
train8=X_train['teacher_number_of_previously_posted_projects'].values.reshape(-1,1)
test8=X_test['teacher_number_of_previously_posted_projects'].values.reshape(-1,1)
X_train_prev_transform,X_test_prev_transform = numerical_data(train8,test8)
print("After vectorizations")
print(X_train_prev_transform.shape, y_train.shape)
print(X_test_prev_transform.shape, y_test.shape)
After vectorizations
(33500, 1) (33500,)
(16500, 1) (16500,)

Negative sentiment:

In [73]:
train9=X_train['negative'].values.reshape(-1,1)
test9=X_test['negative'].values.reshape(-1,1)
X_train_neg,X_test_neg = numerical_data(train9,test9)
print("After vectorizations")
print(X_train_neg.shape, y_train.shape)
print(X_test_neg.shape, y_test.shape)
After vectorizations
(33500, 1) (33500,)
(16500, 1) (16500,)

positive sentiment:

In [74]:
train10=X_train['positive'].values.reshape(-1,1)
test10=X_test['positive'].values.reshape(-1,1)
X_train_pos,X_test_pos = numerical_data(train10,test10)
print("After vectorizations")
print(X_train_pos.shape, y_train.shape)
print(X_test_pos.shape, y_test.shape)
After vectorizations
(33500, 1) (33500,)
(16500, 1) (16500,)

Neutral sentiment:

In [75]:
train11=X_train['neutral'].values.reshape(-1,1)
test11=X_test['neutral'].values.reshape(-1,1)
X_train_neu,X_test_neu = numerical_data(train11,test11)
print("After vectorizations")
print(X_train_neu.shape, y_train.shape)
print(X_test_neu.shape, y_test.shape)
After vectorizations
(33500, 1) (33500,)
(16500, 1) (16500,)

Compound of sentiment:

In [76]:
train12=X_train['compound'].values.reshape(-1,1)
test12=X_test['compound'].values.reshape(-1,1)
X_train_compound,X_test_compound = numerical_data(train12,test12)
print("After vectorizations")
print(X_train_compound.shape, y_train.shape)
print(X_test_compound.shape, y_test.shape)
After vectorizations
(33500, 1) (33500,)
(16500, 1) (16500,)

Stacking of features:

In [90]:
from scipy.sparse import hstack
X_train_set1=hstack((X_train_state_transform, \
                   X_train_prefix_transform, \
                   X_train_grade_transform, \
                   X_train_categories_transform, \
                   X_train_sub_transform, \
                   X_train_price_transform, \
                   X_train_prev_transform, \
                   X_train_essay, \
                   X_train_compound, \
                    X_train_neu, \
                    X_train_pos, \
                    X_train_neg, \
                    X_train_tit)).tocsr()
X_test_set1=hstack((X_test_state_transform, \
                   X_test_prefix_transform, \
                   X_test_grade_transform, \
                   X_test_categories_transform, \
                   X_test_sub_transform, \
                   X_test_price_transform, \
                   X_test_prev_transform, \
                   X_test_essay, \
                   X_test_compound, \
                   X_test_pos, \
                   X_test_neg, \
                   X_test_neu, \
                   X_test_tit)).tocsr()
In [109]:
from scipy.sparse import hstack
X_train_set2=hstack((X_train_state_transform, \
                   X_train_prefix_transform, \
                   X_train_grade_transform, \
                   X_train_categories_transform, \
                   X_train_sub_transform, \
                   X_train_price_transform, \
                   X_train_prev_transform, \
                    train_tfidf_w2v_vec, \
                    train_tfidf_w2v_vectors)).tocsr()
X_test_set2=hstack((X_test_state_transform, \
                   X_test_prefix_transform, \
                   X_test_grade_transform, \
                   X_test_categories_transform, \
                   X_test_sub_transform, \
                   X_test_price_transform, \
                   X_test_prev_transform, \
                   test_tfidf_w2v_vec, \
                   test_tfidf_w2v_vectors)).tocsr()

Model Implementation:

In [92]:
!pip install lightgbm
Requirement already satisfied: lightgbm in c:\users\sarav\new folder\lib\site-packages (3.3.2)
Requirement already satisfied: scipy in c:\users\sarav\new folder\lib\site-packages (from lightgbm) (1.1.0)
Requirement already satisfied: scikit-learn!=0.22.0 in c:\users\sarav\new folder\lib\site-packages (from lightgbm) (0.24.2)
Requirement already satisfied: wheel in c:\users\sarav\new folder\lib\site-packages (from lightgbm) (0.37.1)
Requirement already satisfied: numpy in c:\users\sarav\new folder\lib\site-packages (from lightgbm) (1.19.5)
Requirement already satisfied: joblib>=0.11 in c:\users\sarav\new folder\lib\site-packages (from scikit-learn!=0.22.0->lightgbm) (1.1.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\sarav\new folder\lib\site-packages (from scikit-learn!=0.22.0->lightgbm) (3.1.0)
In [110]:
from lightgbm import LGBMClassifier
parameters = {"max_depth":[1,2,5,10],"n_estimators":[5,10,100,500] }
clf = GridSearchCV(LGBMClassifier(), parameters, cv=5,  scoring='roc_auc',return_train_score=True,n_jobs=-1)
clf.fit(X_train_set1,y_train)
Out[110]:
GridSearchCV(cv=5, estimator=LGBMClassifier(), n_jobs=-1,
             param_grid={'max_depth': [1, 2, 5, 10],
                         'n_estimators': [5, 10, 100, 500]},
             return_train_score=True, scoring='roc_auc')

Best parameters:

In [111]:
trainauc= clf.cv_results_['mean_train_score']
trainaucstd= clf.cv_results_['std_train_score']
cvauc = clf.cv_results_['mean_test_score'] 
cvaucstd= clf.cv_results_['std_test_score']
print('Best score: ',clf.best_score_)
print('Best Hyper parameters: ',clf.best_params_)
Best score:  0.7034580973673894
Best Hyper parameters:  {'max_depth': 2, 'n_estimators': 500}

Heat Maps:

In [95]:
## https://www.geeksforgeeks.org/pandas-groupby-unstack/
## https://indianaiproduction.com/seaborn-heatmap/
## https://stackoverflow.com/questions/34162443/why-do-many-examples-use-fig-ax-plt-subplots-in-matplotlib-pyplot-python

param_max_depth= [1, 2, 5, 10]
param_n_estimators = [5, 10, 100, 500]

scores1 = pd.DataFrame(clf.cv_results_).groupby(['param_n_estimators', 'param_max_depth']).max().unstack()[['mean_test_score', 'mean_train_score']]
fig,ax = plt.subplots(1,2, figsize=(20,5))

sns.heatmap(scores1.mean_train_score, annot = True, fmt='.4g', ax=ax[0],cmap='coolwarm')
sns.heatmap(scores1.mean_test_score, annot = True, fmt='.4g', ax=ax[1],cmap='coolwarm')

ax[0].set_title('Train Set')
ax[1].set_title('Test Set')
plt.show()

ROC-AUC Curve:

In [102]:
lgb1 = LGBMClassifier(class_weight ="balanced",max_depth=2,n_estimators=500)
lgb1.fit(X_train_set1,y_train)


pred_ytrain1 = lgb1.predict_proba(X_train_set1) [:,1]
pred_ytest1 = lgb1.predict_proba(X_test_set1) [:,1]


trfpr1, trtpr1, trthres1 = roc_curve(y_train, pred_ytrain1)
tfpr1, ttpr1, tthres1 = roc_curve(y_test, pred_ytest1)

plt.plot(trfpr1, trtpr1, label="train AUC ="+str(auc(trfpr1,trtpr1)))
plt.plot(tfpr1, ttpr1, label="test AUC ="+str(auc(tfpr1,ttpr1)))
plt.legend()
plt.grid()
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ERROR PLOTS")
plt.show()

Confusion Matrix:

In [97]:
def best_threshold(threshold, fpr, tpr):
    th = threshold[np.argmax(tpr*(1-fpr))]
    print("the maximum value", max(tpr*(1-fpr)), "for threshold", np.round(th,3))
    return th
def predict(prob, threshold):
    predictions = []
    for i in prob:
        if i>=threshold:
            predictions.append(1)
        else:
            predictions.append(0)
    return predictions
In [98]:
best_thres = best_threshold(trthres1, trfpr1, trtpr1)
conf_mat1=metrics.confusion_matrix(y_train,predict(pred_ytrain1,best_thres))

print("CONFUSION MATRIX OF TRAIN DATA")
print(conf_mat1)
sns.heatmap(conf_mat1, annot=True, fmt='d',cmap='GnBu')
the maximum value 0.42355340737500646 for threshold 0.489
CONFUSION MATRIX OF TRAIN DATA
[[ 3358  2007]
 [ 9096 19039]]
Out[98]:
<matplotlib.axes._subplots.AxesSubplot at 0x2b1194a1978>
In [99]:
best_thres1 = best_threshold(tthres1, tfpr1, ttpr1)
conf_mat2=metrics.confusion_matrix(y_test,predict(pred_ytest1,best_thres1))

print("CONFUSION MATRIX OF Test DATA")
print(conf_mat2)
sns.heatmap(conf_mat2, annot=True, fmt='d',cmap='GnBu')
the maximum value 0.4035140572011411 for threshold 0.495
CONFUSION MATRIX OF Test DATA
[[1621 1021]
 [4744 9114]]
Out[99]:
<matplotlib.axes._subplots.AxesSubplot at 0x2b1476eb630>

SET 2:

Model implementation:

In [112]:
from lightgbm import LGBMClassifier
parameters = {"max_depth":[1,2,5,10],"n_estimators":[5,10,100,500] }
clf1 = GridSearchCV(LGBMClassifier(), parameters, cv=5,  scoring='roc_auc',return_train_score=True,n_jobs=-1)
clf1.fit(X_train_set2,y_train)
Out[112]:
GridSearchCV(cv=5, estimator=LGBMClassifier(), n_jobs=-1,
             param_grid={'max_depth': [1, 2, 5, 10],
                         'n_estimators': [5, 10, 100, 500]},
             return_train_score=True, scoring='roc_auc')

Best parameters:

In [113]:
trainauc= clf1.cv_results_['mean_train_score']
trainaucstd= clf1.cv_results_['std_train_score']
cvauc = clf1.cv_results_['mean_test_score'] 
cvaucstd= clf1.cv_results_['std_test_score']
print('Best score: ',clf1.best_score_)
print('Best Hyper parameters: ',clf1.best_params_)
Best score:  0.6808295644203797
Best Hyper parameters:  {'max_depth': 2, 'n_estimators': 500}

Heat map:

In [114]:
## https://www.geeksforgeeks.org/pandas-groupby-unstack/
## https://indianaiproduction.com/seaborn-heatmap/
## https://stackoverflow.com/questions/34162443/why-do-many-examples-use-fig-ax-plt-subplots-in-matplotlib-pyplot-python

param_max_depth= [1, 2, 5, 10]
param_n_estimators = [5, 10, 100, 500]

scores2 = pd.DataFrame(clf1.cv_results_).groupby(['param_n_estimators', 'param_max_depth']).max().unstack()[['mean_test_score', 'mean_train_score']]
fig,ax = plt.subplots(1,2, figsize=(20,5))

sns.heatmap(scores2.mean_train_score, annot = True, fmt='.4g', ax=ax[0],cmap='coolwarm')
sns.heatmap(scores2.mean_test_score, annot = True, fmt='.4g', ax=ax[1],cmap='coolwarm')

ax[0].set_title('Train Set')
ax[1].set_title('Test Set')
plt.show()

ROC_AUC plot:

In [115]:
lgb2 = LGBMClassifier(class_weight ="balanced",max_depth=2,n_estimators=500)
lgb2.fit(X_train_set2,y_train)


pred_ytrain2 = lgb2.predict_proba(X_train_set2) [:,1]
pred_ytest2 = lgb2.predict_proba(X_test_set2) [:,1]


trfpr2, trtpr2, trthres2 = roc_curve(y_train, pred_ytrain2)
tfpr2, ttpr2, tthres2 = roc_curve(y_test, pred_ytest2)

plt.plot(trfpr2, trtpr2, label="train AUC ="+str(auc(trfpr2,trtpr2)))
plt.plot(tfpr2, ttpr2, label="test AUC ="+str(auc(tfpr2,ttpr2)))
plt.legend()
plt.grid()
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ERROR PLOTS")
plt.show()

Confusion Matrix:

In [116]:
best_thres2 = best_threshold(trthres2, trfpr2, trtpr2)
conf_mat3=metrics.confusion_matrix(y_train,predict(pred_ytrain2,best_thres2))

print("CONFUSION MATRIX OF TRAIN DATA")
print(conf_mat3)
sns.heatmap(conf_mat3, annot=True, fmt='d',cmap='GnBu')
the maximum value 0.5367206142796738 for threshold 0.5
CONFUSION MATRIX OF TRAIN DATA
[[ 4033  1332]
 [ 8047 20088]]
Out[116]:
<matplotlib.axes._subplots.AxesSubplot at 0x2b127515828>
In [117]:
best_thres3 = best_threshold(tthres2,tfpr2, ttpr2)
conf_mat4=metrics.confusion_matrix(y_test,predict(pred_ytest2,best_thres3))

print("CONFUSION MATRIX OF TRAIN DATA")
print(conf_mat4)
sns.heatmap(conf_mat4, annot=True, fmt='d',cmap='GnBu')
the maximum value 0.4296263747501013 for threshold 0.514
CONFUSION MATRIX OF TRAIN DATA
[[1680  962]
 [4495 9363]]
Out[117]:
<matplotlib.axes._subplots.AxesSubplot at 0x2b137b95550>

Observations:

In [118]:
# http://zetcode.com/python/prettytable/

from prettytable import PrettyTable

x = PrettyTable()
x.field_names = ["Vectorizer and Encoding", "Model", "Hyperparameters(max depth,min samples split)", "Test AUC"]

x.add_row(["TFIDF + Response Coding ", "LightGBM", "(2, 500)", 0.703])
x.add_row(["TFIDF W2V + Response Coding", "LightGBM", "(2, 500)", 0.680])
print(x)
+-----------------------------+----------+----------------------------------------------+----------+
|   Vectorizer and Encoding   |  Model   | Hyperparameters(max depth,min samples split) | Test AUC |
+-----------------------------+----------+----------------------------------------------+----------+
|   TFIDF + Response Coding   | LightGBM |                   (2, 500)                   |  0.703   |
| TFIDF W2V + Response Coding | LightGBM |                   (2, 500)                   |   0.68   |
+-----------------------------+----------+----------------------------------------------+----------+